Import Word Documents



In [1]:

    
# Set up path to files to be used for testing

test_data_path = "/Users/wbrierley/Documents/Jupyter Notebooks/Document Parsing Data/"
test_filename = "Brierley Bill Full CV 20180812.docx"
docxfile = test_data_path + test_filename

Word Documents are Archives of xml files



In [2]:

    
import zipfile



In [11]:

    
mydocx = zipfile.ZipFile(docxfile)
mydocx.namelist()









    Out[11]:





['[Content_Types].xml',
 '_rels/.rels',
 'word/_rels/document.xml.rels',
 'word/document.xml',
 'word/footnotes.xml',
 'word/endnotes.xml',
 'word/header1.xml',
 'word/footer1.xml',
 'word/theme/theme1.xml',
 'word/settings.xml',
 'word/_rels/settings.xml.rels',
 'word/webSettings.xml',
 'word/styles.xml',
 'word/numbering.xml',
 'docProps/core.xml',
 'word/fontTable.xml',
 'docProps/app.xml']



In [18]:

    
with mydocx.open('[Content_Types].xml') as component:
    print(component.read().getroot()









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-18-11e1a26be2ff> in <module>
      1 with mydocx.open('[Content_Types].xml') as component:
----> 2     print(component.parse().getroot())

AttributeError: 'ZipExtFile' object has no attribute 'parse'

Processing the xml



In [ ]:

    
import xml.etree.ElementTree as ET



In [ ]:

    
def opendocx(file):
    '''Open a docx file, return a document XML tree'''
    mydoc = zipfile.ZipFile(file)
    xmlcontent = ET.parse(mydoc)
    #document = ET.fromstring(mydoc)
    return document



In [ ]:

    
doc_root = doc.getroot()



In [ ]: